First, let’s understand how different types of spatial data are handled in R, beginning with vector data (i.e. points, lines and polygons) and ending with a brief raster example.
We will begin by looking at polygon data. Let’s look at Philadelphia census tracts as an example. Census shapefiles are free to download on the Census website, but we will load a version that has already been converted to an sf object.
library(sf)
# Load the Philadelphia census tracts data (a Census shapefile already
# converted to an sf object and saved as an .rds file)
pt_sf <- readRDS("data/philadelphia_tracts.rds")
# Note pt_sf is an sf ("simple feature") object of type "MULTIPOLYGON";
# it behaves like a data.frame with an extra geometry column
class(pt_sf) ## [1] "sf" "data.frame"
## Classes 'sf' and 'data.frame': 384 obs. of 15 variables:
## $ OBJECTID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ STATEFP10 : Factor w/ 1 level "42": 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTYFP10: Factor w/ 1 level "101": 1 1 1 1 1 1 1 1 1 1 ...
## $ TRACTCE10 : Factor w/ 384 levels "000100","000200",..: 95 96 97 134 135 136 137 138 139 140 ...
## $ GEOID10 : Factor w/ 384 levels "42101000100",..: 95 96 97 134 135 136 137 138 139 140 ...
## $ NAME10 : Factor w/ 384 levels "1","10.01","10.02",..: 369 370 371 43 44 46 47 48 49 50 ...
## $ NAMELSAD10: Factor w/ 384 levels "Census Tract 1",..: 369 370 371 43 44 46 47 48 49 50 ...
## $ MTFCC10 : Factor w/ 1 level "G5020": 1 1 1 1 1 1 1 1 1 1 ...
## $ FUNCSTAT10: Factor w/ 1 level "S": 1 1 1 1 1 1 1 1 1 1 ...
## $ ALAND10 : int 366717 319070 405273 341256 562934 439802 562132 789935 570015 609439 ...
## $ AWATER10 : int 0 0 0 0 0 0 0 277434 282808 0 ...
## $ INTPTLAT10: Factor w/ 384 levels "+39.8798897",..: 106 114 113 141 137 132 126 110 120 131 ...
## $ INTPTLON10: Factor w/ 384 levels "-074.9667387",..: 350 362 370 257 234 208 168 129 113 135 ...
## $ LOGRECNO : Factor w/ 384 levels "10335","10336",..: 95 96 97 134 135 136 137 138 139 140 ...
## $ geometry :sfc_MULTIPOLYGON of length 384; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:36, 1:2] -75.2 -75.2 -75.2 -75.2 -75.2 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geometry"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA NA NA NA NA NA ...
## ..- attr(*, "names")= chr [1:14] "OBJECTID" "STATEFP10" "COUNTYFP10" "TRACTCE10" ...
## Simple feature collection with 6 features and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -75.24747 ymin: 39.96047 xmax: -75.15853 ymax: 39.98
## Geodetic CRS: WGS 84
## OBJECTID STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10 NAMELSAD10
## 0 1 42 101 009400 42101009400 94 Census Tract 94
## 1 2 42 101 009500 42101009500 95 Census Tract 95
## 2 3 42 101 009600 42101009600 96 Census Tract 96
## 3 4 42 101 013800 42101013800 138 Census Tract 138
## 4 5 42 101 013900 42101013900 139 Census Tract 139
## 5 6 42 101 014000 42101014000 140 Census Tract 140
## MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 INTPTLON10 LOGRECNO
## 0 G5020 S 366717 0 +39.9632709 -075.2322437 10429
## 1 G5020 S 319070 0 +39.9658709 -075.2379140 10430
## 2 G5020 S 405273 0 +39.9655396 -075.2435075 10431
## 3 G5020 S 341256 0 +39.9764504 -075.1771771 10468
## 4 G5020 S 562934 0 +39.9750563 -075.1711846 10469
## 5 G5020 S 439802 0 +39.9735358 -075.1630966 10470
## geometry
## 0 MULTIPOLYGON (((-75.22927 3...
## 1 MULTIPOLYGON (((-75.23536 3...
## 2 MULTIPOLYGON (((-75.24343 3...
## 3 MULTIPOLYGON (((-75.17341 3...
## 4 MULTIPOLYGON (((-75.17313 3...
## 5 MULTIPOLYGON (((-75.16141 3...
## [1] 384 15
## Simple feature collection with 1 feature and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -75.23681 ymin: 39.96047 xmax: -75.22768 ymax: 39.96609
## Geodetic CRS: WGS 84
## OBJECTID STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10 NAMELSAD10
## 0 1 42 101 009400 42101009400 94 Census Tract 94
## MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 INTPTLON10 LOGRECNO
## 0 G5020 S 366717 0 +39.9632709 -075.2322437 10429
## geometry
## 0 MULTIPOLYGON (((-75.22927 3...
## [1] Census Tract 94 Census Tract 95 Census Tract 96 Census Tract 138
## [5] Census Tract 139 Census Tract 140
## 384 Levels: Census Tract 1 Census Tract 10.01 ... Census Tract 9891
## Simple feature collection with 6 features and 1 field
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -75.24747 ymin: 39.96047 xmax: -75.15853 ymax: 39.98
## Geodetic CRS: WGS 84
## NAMELSAD10 geometry
## 0 Census Tract 94 MULTIPOLYGON (((-75.22927 3...
## 1 Census Tract 95 MULTIPOLYGON (((-75.23536 3...
## 2 Census Tract 96 MULTIPOLYGON (((-75.24343 3...
## 3 Census Tract 138 MULTIPOLYGON (((-75.17341 3...
## 4 Census Tract 139 MULTIPOLYGON (((-75.17313 3...
## 5 Census Tract 140 MULTIPOLYGON (((-75.16141 3...
# We can extract the geometry of philly.tracts with the st_geometry function
pt_geo <- st_geometry(pt_sf)
pt_geo## Geometry set for 384 features
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -75.28031 ymin: 39.86747 xmax: -74.95575 ymax: 40.13793
## Geodetic CRS: WGS 84
## First 5 geometries:
pt_geo[[1]] # perimeter coordinates for the first census tract of the sf
pt_sf[1,] # i.e. Census Tract 94## Simple feature collection with 1 feature and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -75.23681 ymin: 39.96047 xmax: -75.22768 ymax: 39.96609
## Geodetic CRS: WGS 84
## OBJECTID STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10 NAMELSAD10
## 0 1 42 101 009400 42101009400 94 Census Tract 94
## MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 INTPTLON10 LOGRECNO
## 0 G5020 S 366717 0 +39.9632709 -075.2322437 10429
## geometry
## 0 MULTIPOLYGON (((-75.22927 3...
pt_geo[[2]] # perimeter coordinates for the second census tract of the sf
pt_sf[2,] # i.e. Census Tract 95## Simple feature collection with 1 feature and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -75.24076 ymin: 39.96148 xmax: -75.23535 ymax: 39.97035
## Geodetic CRS: WGS 84
## OBJECTID STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10 NAMELSAD10
## 1 2 42 101 009500 42101009500 95 Census Tract 95
## MTFCC10 FUNCSTAT10 ALAND10 AWATER10 INTPTLAT10 INTPTLON10 LOGRECNO
## 1 G5020 S 319070 0 +39.9658709 -075.2379140 10430
## geometry
## 1 MULTIPOLYGON (((-75.23536 3...
# The base plot function has some aesthetic options we can use to tweak our plots
plot(pt_geo, col = "lemonchiffon2")
Next let’s look at an example of line data: streets in Philadelphia with bicycle access. This data was sourced directly from the Philadelphia Bike Network.
## Reading layer `Bike_Network' from data source
## `/Users/tomasoles/Library/CloudStorage/Dropbox/Teaching/AppliedDataAnalysis/applied_data_analysis/3_week/data/Bike_Network/Bike_Network.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 5101 features and 8 fields
## Geometry type: LINESTRING
## Dimension: XY
## Bounding box: xmin: -75.26927 ymin: 39.87651 xmax: -74.96572 ymax: 40.124
## Geodetic CRS: WGS 84
## [1] "sf" "data.frame"
# Once again, let's access the spatial attributes of this sf object with the st_geometry command.
bn_geo <- st_geometry(bn_sf)
bn_geo[[2]] # line segment 2
bn_sf[2,]## Simple feature collection with 1 feature and 8 fields
## Geometry type: LINESTRING
## Dimension: XY
## Bounding box: xmin: -75.25337 ymin: 39.88161 xmax: -75.25149 ymax: 39.88373
## Geodetic CRS: WGS 84
## OBJECTID SEG_ID STREETNAME ST_CODE ONEWAY CLASS TYPE Shape__Len
## 2 2 100003 BARTRAM AVE 16120 B 2 Conventional 371.888
## geometry
## 2 LINESTRING (-75.25149 39.88...
As an example of point data, we will work with crime incidents that occurred in Philadelphia in September 2018. The full publicly available crime incidents database for Philadelphia is maintained by the Philadelphia Police Department and is available on the OpenDataPhilly website.
## # A tibble: 6 × 18
## the_geom cartodb_id the_geom_webmercator objectid dc_dist psa
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 <NA> 3096883 <NA> 15611163 15 3
## 2 <NA> 3097416 <NA> 15854864 9 3
## 3 <NA> 3097550 <NA> 16187498 9 5
## 4 <NA> 3101729 <NA> 14641739 9 3
## 5 <NA> 3102256 <NA> 15982976 7 3
## 6 0101000020E610000031D2… 3103180 0101000020110F00006… 14989348 9 2
## # ℹ 12 more variables: dispatch_date_time <dttm>, dispatch_date <chr>,
## # dispatch_time <time>, hour <dbl>, dc_key <dbl>, location_block <chr>,
## # ucr_general <dbl>, text_general_code <chr>, point_x <dbl>, point_y <dbl>,
## # lat <dbl>, lng <dbl>
# Drop records with missing coordinates before converting to sf
# (logical subsetting is sufficient here; is.na() never returns NA,
# so the which() wrapper is unnecessary)
crime_aux <- crime_aux[!is.na(crime_aux$lat) & !is.na(crime_aux$lng), ]
# Convert to an sf POINT object. The lat/lng columns from OpenDataPhilly
# are WGS 84 coordinates, so set the CRS explicitly (EPSG:4326) rather
# than leaving it NA as the bare st_as_sf() call would
crime <- st_as_sf(crime_aux, coords = c("lng", "lat"), crs = 4326)
# The crime data is an sf object of type POINT and includes information on the date, time and offense type for each incident
class(crime)## [1] "sf" "tbl_df" "tbl" "data.frame"
## Simple feature collection with 6 features and 16 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: -75.21435 ymin: 39.94692 xmax: -75.05671 ymax: 40.06165
## CRS: NA
## # A tibble: 6 × 17
## the_geom cartodb_id the_geom_webmercator objectid dc_dist psa
## <chr> <dbl> <chr> <dbl> <dbl> <chr>
## 1 0101000020E610000031D2… 3103180 0101000020110F00006… 14989348 9 2
## 2 0101000020E6100000FDD7… 3120696 0101000020110F0000E… 13220352 35 3
## 3 0101000020E6100000A869… 3123784 0101000020110F00007… 17337433 9 4
## 4 0101000020E6100000AB40… 3123917 0101000020110F0000A… 16519896 2 3
## 5 0101000020E6100000AB40… 3124050 0101000020110F0000A… 14864200 2 3
## 6 0101000020E6100000B79E… 3127464 0101000020110F0000F… 15167770 5 1
## # ℹ 11 more variables: dispatch_date_time <dttm>, dispatch_date <chr>,
## # dispatch_time <time>, hour <dbl>, dc_key <dbl>, location_block <chr>,
## # ucr_general <dbl>, text_general_code <chr>, point_x <dbl>, point_y <dbl>,
## # geometry <POINT>
# Let's take a look at offense types and use dplyr to filter by offense_type...
table(crime$text_general_code)##
## Aggravated Assault Firearm Aggravated Assault No Firearm
## 1176 2305
## All Other Offenses Arson
## 3309 173
## Burglary Non-Residential Burglary Residential
## 659 1310
## Disorderly Conduct DRIVING UNDER THE INFLUENCE
## 88 211
## Embezzlement Forgery and Counterfeiting
## 58 41
## Fraud Gambling Violations
## 1684 6
## Homicide - Criminal Homicide - Justifiable
## 178 1
## Liquor Law Violations Motor Vehicle Theft
## 16 2430
## Narcotic / Drug Law Violations Offenses Against Family and Children
## 855 94
## Other Assaults Other Sex Offenses (Not Commercialized)
## 5923 206
## Prostitution and Commercialized Vice Public Drunkenness
## 63 3
## Rape Receiving Stolen Property
## 202 292
## Robbery Firearm Robbery No Firearm
## 728 1133
## Theft from Vehicle Thefts
## 4239 17354
## Vagrancy/Loitering Vandalism/Criminal Mischief
## 5 2936
## Weapon Violations
## 656
homicide <- filter(crime, text_general_code == "Homicide - Criminal")
fraud <- filter(crime, text_general_code == "Fraud")
# Note subsets of an sf object are also sf objects
class(homicide)## [1] "sf" "tbl_df" "tbl" "data.frame"
## [1] "sf" "tbl_df" "tbl" "data.frame"
# Points by themselves are not very easy to understand. Let's layer them on top of the tract polygons with add = TRUE
plot(pt_geo)
plot(st_geometry(fraud), col = "blue", alpha = 0.1, add = TRUE)
plot(st_geometry(homicide), col = "red", add = TRUE)
legend("bottomright", legend = c("Fraud", "Homicide"), title = "Offense type:", col = c("blue", "red"), pch = 1, bty = "n")
So far, we have considered point, line, and polygon data, all of
which fall under the umbrella of vector data types. Rasters are a
distinct GIS data type that we will consider only briefly because they
cannot be handled with sf methods. We will look at the
volcano dataset, which gives topographic information for
Maunga Whau (a volcano located in Auckland, New Zealand) on a 10m by 10m
grid. Because it is a relatively small raster, we can handle
volcano using base functions. Larger rasters should be
handled using the raster package.
## [1] "matrix" "array"
## num [1:87, 1:61] 100 101 102 103 104 105 105 106 107 108 ...
We will use the example of New York State county-aggregated Lyme disease incidence for 2014-2016 to try our hand at spatial analysis. This data is publicly available and can be accessed at the Health Data NY website. Raw data can be downloaded in .csv format. If you’re curious to see how this tabular data can be merged with a New York State county shapefile (available at NYS GIS), you can see how this is done in the ‘prep_nys_lyme_data.R’ script file located in the ‘data’ folder of our project directory. But for now, we’ll start with this data that has already been merged and converted to an ‘sf’ object.
library(sf)
# Load NYS Lyme incidence data
lyme <- readRDS("data/nys_lyme_data.rds")
# Let's take a look at some data attributes
class(lyme)## [1] "sf" "data.frame"
## Simple feature collection with 6 features and 8 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 163621.8 ymin: 4515576 xmax: 608219.1 ymax: 4853641
## Projected CRS: NAD83 / UTM zone 18N
## NAME FIPS_CODE Health.Topic
## 1 Albany 36001 Communicable Disease Indicators
## 2 Allegany 36003 Communicable Disease Indicators
## 3 Bronx 36005 Communicable Disease Indicators
## 4 Broome 36007 Communicable Disease Indicators
## 5 Cattaraugus 36009 Communicable Disease Indicators
## 6 Cayuga 36011 Communicable Disease Indicators
## Indicator Measure.Unit Lyme.Incidence.Rate
## 1 Lyme disease incidence per 100,000 Rate 71.5
## 2 Lyme disease incidence per 100,000 Rate 7.7
## 3 Lyme disease incidence per 100,000 Rate 3.4
## 4 Lyme disease incidence per 100,000 Rate 109.8
## 5 Lyme disease incidence per 100,000 Rate 12.4
## 6 Lyme disease incidence per 100,000 Rate 10.6
## Data.Years Data.Source
## 1 2014-2016 Bureau of Communicable Disease Control Data as of May, 2018
## 2 2014-2016 Bureau of Communicable Disease Control Data as of May, 2018
## 3 2014-2016 Bureau of Communicable Disease Control Data as of May, 2018
## 4 2014-2016 Bureau of Communicable Disease Control Data as of May, 2018
## 5 2014-2016 Bureau of Communicable Disease Control Data as of May, 2018
## 6 2014-2016 Bureau of Communicable Disease Control Data as of May, 2018
## geometry
## 1 MULTIPOLYGON (((605729 4737...
## 2 MULTIPOLYGON (((229573.9 47...
## 3 MULTIPOLYGON (((595540.7 45...
## 4 MULTIPOLYGON (((428899.3 46...
## 5 MULTIPOLYGON (((169747.3 47...
## 6 MULTIPOLYGON (((369644.2 47...
# Note that the variable Lyme.Incidence.Rate gives the county-level Lyme disease incidence per 100,000 population
# Once again, we can plot the spatial attrbutes of this data
plot(st_geometry(lyme))This data is an example of regional rate data. An easy way to map this data is using the ‘tmap’ library. Let’s load ‘tmap’ and create an interactive map with just a few lines of code.
library(tmap)
tmap_mode("view") # set mode to interactive
tm_shape(lyme) + # specify sf object with geographic attribute of interest
tm_polygons("Lyme.Incidence.Rate") # specify column with value of interest
We have a missing value in St. Lawrence county. Let’s remove this row from the data so it doesn’t throw an error later in our analysis.
This section was adapted from a tutorial created by Manuel Gimond, which can be found on his GitHub page.
Let’s begin by looking at the distribution of our Lyme incidence rate data. The Moran’s I statistic is not robust to extreme values or outliers so we will need to transform our data if it deviates greatly from a normal distribution.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.90 13.50 36.30 80.27 98.50 599.60
Our data is skewed strongly to the right with lots of outliers much greater than the mean. Let’s see if a log transformation can make our data look more normal.
# Create a new variable holding the log-transformed incidence rate;
# Moran's I is sensitive to skew and outliers, so we analyze the log scale
lyme$log_lyme_incidence <- log(lyme$Lyme.Incidence.Rate)
# Histogram of the transformed rates (summary shown below)
hist(lyme$log_lyme_incidence)## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6419 2.6027 3.5918 3.5974 4.5901 6.3963
That’s much better! We can see what our log-transformed values look like on a map.
Now we’re ready to begin our analysis. The first step is to define “neighboring” polygons. Recall that neighbors can be defined based on contiguity or distance or as the k nearest neighbors to each polygon. We’ll use a queen-case contiguity-based definition, where any contiguous polygon that shares at least one vertex will be considered a neighbor. We can store the neighbors of each one of our polygons by creating an ‘nb’ object using the ‘poly2nb’ function from the ‘spdep’ library.
library(spdep)
# Create the neighbor (nb) object from the Lyme dataset using queen-case
# contiguity: any polygon sharing at least one vertex is a neighbor.
# Use TRUE rather than the abbreviation T (T/F can be reassigned in R).
lyme_nb <- poly2nb(lyme, queen = TRUE) # queen case
class(lyme_nb)## [1] "nb"
## List of 61
## $ : int [1:6] 11 20 42 45 46 47
## $ : int [1:4] 5 26 50 60
## $ : int [1:4] 30 31 41 59
## $ : int [1:4] 9 12 13 53
## $ : int [1:4] 2 7 15 60
## $ : int [1:7] 12 23 34 38 49 54 58
## $ : int [1:2] 5 15
## $ : int [1:4] 48 50 53 54
## $ : int [1:5] 4 12 13 27 39
## $ : int [1:2] 16 17
## $ : int [1:5] 1 14 20 42 55
## $ : int [1:7] 4 6 9 27 34 53 54
## $ : int [1:7] 4 9 20 39 47 52 55
## $ : int [1:4] 11 36 40 55
## $ : int [1:5] 5 7 19 32 60
## $ : int [1:5] 10 17 21 56 57
## $ : int [1:3] 10 16 21
## $ : int [1:4] 21 22 29 45
## $ : int [1:6] 15 26 28 32 37 60
## $ : int [1:6] 1 11 13 42 47 55
## $ : int [1:6] 16 17 18 22 45 56
## $ : int [1:6] 18 21 25 29 33 39
## $ : int [1:3] 6 25 38
## $ : int [1:3] 31 41 43
## $ : int [1:4] 22 23 33 38
## $ : int [1:6] 2 19 28 35 50 60
## $ : int [1:6] 9 12 33 34 38 39
## $ : int [1:5] 19 26 35 37 58
## $ : int [1:6] 18 22 39 45 46 47
## $ : int [1:4] 3 41 51 59
## $ : int [1:3] 3 24 41
## $ : int [1:3] 15 19 37
## $ : int [1:5] 22 25 27 38 39
## $ : int [1:4] 6 12 27 38
## $ : int [1:6] 26 28 49 50 58 61
## $ : int [1:5] 14 40 44 52 55
## $ : int [1:3] 19 28 32
## $ : int [1:6] 6 23 25 27 33 34
## $ : int [1:7] 9 13 22 27 29 33 47
## $ : int [1:4] 14 36 44 59
## $ : int [1:5] 3 24 30 31 43
## $ : int [1:5] 1 11 20 45 57
## $ : int [1:2] 24 41
## $ : int [1:3] 36 40 59
## $ : int [1:8] 1 18 21 29 42 46 56 57
## $ : int [1:4] 1 29 45 47
## $ : int [1:6] 1 13 20 29 39 46
## $ : int [1:5] 8 49 50 54 61
## $ : int [1:6] 6 35 48 54 58 61
## $ : int [1:6] 2 8 26 35 48 61
## $ : int 30
## $ : int [1:3] 13 36 55
## $ : int [1:4] 4 8 12 54
## $ : int [1:6] 6 8 12 48 49 53
## $ : int [1:6] 11 13 14 20 36 52
## $ : int [1:4] 16 21 45 57
## $ : int [1:4] 16 42 45 56
## $ : int [1:4] 6 28 35 49
## $ : int [1:4] 3 30 40 44
## $ : int [1:5] 2 5 15 19 26
## $ : int [1:4] 35 48 49 50
## - attr(*, "class")= chr "nb"
## - attr(*, "region.id")= chr [1:61] "1" "2" "3" "4" ...
## - attr(*, "call")= language poly2nb(pl = lyme, queen = T)
## - attr(*, "type")= chr "queen"
## - attr(*, "sym")= logi TRUE
## [1] 11 20 42 45 46 47
## [1] "Albany"
## [1] "Columbia" "Greene" "Rensselaer" "Saratoga" "Schenectady"
## [6] "Schoharie"
Next, we’ll assign weights to each neighboring polygon. We’ll use the simplest option (style = "W"), which assigns equal weight to each neighboring polygon. In other words, the weight applied to the neighbors of a polygon will equal 1/(no. of neighbors for that polygon).
# Assign weights with nb2listw; style = "W" row-standardizes, so each
# neighbor of a polygon gets weight 1 / (number of neighbors of that polygon)
lyme_weights <- nb2listw(lyme_nb, style = "W")
class(lyme_weights)## [1] "listw" "nb"
# Inspect the weights object; max.level = 1 keeps the str() output brief
str(lyme_weights, max.level = 1) # view the structure of lw, we'll set max.level = 1 for easier viewing## List of 3
## $ style : chr "W"
## $ neighbours:List of 61
## ..- attr(*, "class")= chr "nb"
## ..- attr(*, "region.id")= chr [1:61] "1" "2" "3" "4" ...
## ..- attr(*, "call")= language poly2nb(pl = lyme, queen = T)
## ..- attr(*, "type")= chr "queen"
## ..- attr(*, "sym")= logi TRUE
## $ weights :List of 61
## ..- attr(*, "mode")= chr "binary"
## ..- attr(*, "W")= logi TRUE
## ..- attr(*, "comp")=List of 1
## - attr(*, "class")= chr [1:2] "listw" "nb"
## - attr(*, "region.id")= chr [1:61] "1" "2" "3" "4" ...
## - attr(*, "call")= language nb2listw(neighbours = lyme_nb, style = "W")
## - attr(*, "zero.policy")= logi FALSE
## [1] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
## [1] "Allegany"
## [1] 5 26 50 60
## [1] 0.25 0.25 0.25 0.25
Now we can calculate the Moran’s I statistic and perform hypothesis testing using ‘moran.test’ (analytical calculation) and ‘moran.mc’ (via Monte Carlo simulations). These functions require that we specify the variable of interest and the list of neighbor weights for each polygon. The option ‘alternative = “greater”’ specifies testing for positive spatial autocorrelation, which is also the default for these functions. The ‘moran.mc’ function also requires that we specify the number of simulations with option ‘nsim’.
# Analytical test - quicker computation but sensitive to irregularly distributed polygons
moran.test(lyme$log_lyme_incidence, lyme_weights, alternative = "two.sided")##
## Moran I test under randomisation
##
## data: lyme$log_lyme_incidence
## weights: lyme_weights
##
## Moran I statistic standard deviate = 8.7379, p-value < 2.2e-16
## alternative hypothesis: two.sided
## sample estimates:
## Moran I statistic Expectation Variance
## 0.720555645 -0.016666667 0.007118479
# Monte Carlo (MC) simulation is slower but the preferred method to calculate an accurate p-value
MC <- moran.mc(lyme$log_lyme_incidence, lyme_weights, nsim = 999, alternative = "greater")
MC##
## Monte-Carlo simulation of Moran I
##
## data: lyme$log_lyme_incidence
## weights: lyme_weights
## number of simulations + 1: 1000
##
## statistic = 0.72056, observed rank = 1000, p-value = 0.001
## alternative hypothesis: greater
Next, we’ll have R compute the average neighbor log Lyme incidence value for each polygon. These values are often referred to as spatially lagged values.
## [1] 5.279184 2.322263 2.528599 4.374581 1.594856 3.496443 1.926349 4.496455
## [9] 4.185537 4.021645 5.403306 3.925311 5.059918 5.668410 1.717511 3.812259
## [17] 3.896203 3.819425 1.544629 5.137699 3.989122 3.653277 3.248488 2.564676
## [25] 3.600780 2.600797 3.843967 2.163104 4.196165 2.510087 1.763183 1.827523
## [33] 3.858524 3.226352 2.892245 5.120626 1.877202 3.169335 3.897496 4.497368
## [41] 2.376910 5.287309 2.032887 4.765607 4.179317 4.232511 4.645199 4.015905
## [49] 3.583862 3.203023 2.079442 5.010065 4.314726 3.778138 5.388543 4.410235
## [57] 4.868271 2.727233 3.360805 1.890025 3.590111
# Scatter plot of the spatially lagged values against the observed values;
# the slope of the fitted line equals the Moran's I statistic (compare the
# 0.72056 coefficient below with the moran.test estimate above)
plot(log_lyme_incidence.lag ~ lyme$log_lyme_incidence, pch=16, asp=1)
# Regress lagged values on observed values to recover that slope
M1 <- lm(log_lyme_incidence.lag ~ lyme$log_lyme_incidence)
summary(M1)##
## Call:
## lm(formula = log_lyme_incidence.lag ~ lyme$log_lyme_incidence)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.63142 -0.40758 0.02266 0.46627 1.21356
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.98907 0.22825 4.333 5.8e-05 ***
## lyme$log_lyme_incidence 0.72056 0.05952 12.106 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6175 on 59 degrees of freedom
## Multiple R-squared: 0.713, Adjusted R-squared: 0.7081
## F-statistic: 146.5 on 1 and 59 DF, p-value: < 2.2e-16
## lyme$log_lyme_incidence
## 0.7205556
We can see the results of the MC simulations graphically by passing the output of MC model to the ‘plot’ function.
The local Moran statistic is an extension of the Moran’s I for the analysis of local (rather than global) spatial autocorrelation. There are some steps in common with the global clustering analysis we performed previously (for example, we have to calculate neighbor weights again) but there are key differences, due to particularities of the ‘rgeoda’ library.
We will once again find queen-case contiguous weights, though in ‘rgeoda’ we do this with the ‘queen_weights’ function. Note instead of a list of weights like we saw previously with the ‘nb2listw’ function, ‘queen_weights’ outputs an ‘rgeoda’ ‘Weight’ object, which has some nice features.
library(rgeoda)
# Find queen-case contiguous weights
lyme_gweights <- queen_weights(lyme)
class(lyme_gweights)## [1] "Weight"
## attr(,"package")
## [1] "rgeoda"
## Reference class 'Weight' [package "rgeoda"] with 9 fields
## $ gda_w :Formal class 'p_GeoDaWeight' [package "rgeoda"] with 1 slot
## .. ..@ pointer:<externalptr>
## $ is_symmetric : logi TRUE
## $ sparsity : num 0.0763
## $ min_neighbors : int 1
## $ max_neighbors : int 8
## $ num_obs : int 61
## $ mean_neighbors : num 4.66
## $ median_neighbors: num 5
## $ has_isolates : logi FALSE
## and 27 methods, of which 13 are possibly relevant:
## GetNeighbors, GetNeighborSize, GetNeighborWeights, GetPointer, GetSparsity,
## HasIsolates, initialize, IsSymmetric, SaveToFile, SetNeighbors,
## SetNeighborsAndWeights, SpatialLag, Update
## [1] 11 20 42 45 46 47
## [1] "Albany"
## [1] "Columbia" "Greene" "Rensselaer" "Saratoga" "Schenectady"
## [6] "Schoharie"
## [1] 5 26 50 60
## [1] "Allegany"
## [1] "Cattaraugus" "Livingston" "Steuben" "Wyoming"
## [1] 1 1 1 1 1 1
## [1] 1 1 1 1
Now you can use your ‘geoda’ ‘Weights’ to calculate the Local Moran statistic at each polygon.
# We will coerce our data variable into a one-column data frame because this is the format required by the local_moran function
log_lyme_df <- as.data.frame(lyme$log_lyme_incidence)
# Now we can run the local_moran function
lyme_lisa <- local_moran(lyme_gweights, log_lyme_df)
# local_moran returns a LISA object
class(lyme_lisa)## [1] "LISA"
## attr(,"package")
## [1] "rgeoda"
# Let's take a closer look at this LISA object
lyme_lisa$lisa_vals # View local Moran's I values for each polygon## [1] 6.303589e-01 1.106106e+00 1.414135e+00 4.771674e-01 1.205222e+00
## [6] 6.955438e-02 2.297059e+00 3.348814e-01 3.241632e-01 -3.410545e-02
## [11] 2.786406e+00 -1.137256e-02 7.590118e-01 1.810965e+00 2.370781e+00
## [16] 1.539095e-01 -7.265369e-02 -3.917429e-02 1.341750e+00 2.403365e+00
## [21] -5.327627e-02 4.522845e-03 -2.021608e-02 6.452192e-01 -3.709835e-05
## [26] 1.369921e+00 -5.424938e-02 8.254362e-01 -1.846812e-03 9.200084e-01
## [31] 6.151358e-01 2.915899e+00 -6.473286e-02 1.193171e-01 2.607967e-01
## [36] 1.225206e+00 1.797554e+00 -4.994189e-02 2.284720e-01 1.108447e+00
## [41] 1.366207e+00 2.002465e+00 4.258473e-01 4.793295e-01 3.220550e-01
## [46] 1.858658e-01 5.146906e-01 2.360824e-01 4.121139e-03 -3.754591e-02
## [51] -1.523086e-01 9.750675e-01 5.005027e-01 1.171338e-01 1.861588e+00
## [56] 3.084596e-01 8.629906e-01 4.824627e-01 1.943183e-02 1.642308e+00
## [61] -2.581274e-03
## [1] 0.001 0.019 0.046 0.129 0.001 0.425 0.048 0.097 0.156 0.338 0.001 0.222
## [13] 0.002 0.001 0.001 0.309 0.389 0.399 0.001 0.001 0.220 0.438 0.335 0.102
## [25] 0.496 0.024 0.306 0.007 0.148 0.059 0.006 0.010 0.315 0.284 0.085 0.003
## [37] 0.010 0.217 0.261 0.085 0.014 0.001 0.047 0.065 0.099 0.169 0.022 0.242
## [49] 0.482 0.248 0.157 0.020 0.121 0.350 0.002 0.105 0.018 0.096 0.335 0.002
## [61] 0.493
Finally, we can make a map of our results! ‘rgeoda’ includes some nifty functions (‘map_colors’, ‘map_labels’ and ‘map_clusters’ to help us with our mapping).
# Pull the LISA cluster colors, labels and per-polygon cluster assignments
map_colors <- lisa_colors(lyme_lisa)
map_labels <- lisa_labels(lyme_lisa)
map_clusters <- lisa_clusters(lyme_lisa)
# Cluster codes are 0-based, so shift by 1 to index the 1-based color
# vector; direct vectorized indexing replaces the original sapply() loop
# (sapply(map_clusters, function(x){return(map_colors[[x+1]])}))
plot(st_geometry(lyme),
     col = map_colors[map_clusters + 1],
     border = "#333333", lwd = 0.2)
legend('topright', legend = map_labels, fill = map_colors, border = "#eeeeee", cex = 0.7)The ongoing conflict between Russia and Ukraine has caused significant economic disruptions, particularly through sanctions and changes in trade patterns. One way to analyze these disruptions is through Input-Output (I-O) analysis, a method that models the interdependencies between sectors in an economy. By examining output multipliers, we can understand how changes in one sector’s demand affect the entire economy.
The core formula used in I-O analysis to compute total output and output multipliers is derived from the Leontief inverse matrix. The formula is as follows:
\[ \mathbf{X} = (\mathbf{I} - \mathbf{A})^{-1} \mathbf{F} \]
Where: - \(\mathbf{X}\) is the output vector, representing total output for each sector. - \(\mathbf{I}\) is the identity matrix. - \(\mathbf{A}\) is the direct requirements matrix (or technical coefficient matrix), representing the inputs required from other sectors to produce one unit of output. - \((\mathbf{I} - \mathbf{A})^{-1}\) is the Leontief inverse matrix, capturing the total intersectoral dependencies. - \(\mathbf{F}\) is the final demand vector, representing demand for goods and services from external sources (e.g., households, exports).
The elements of the direct requirements matrix \(\mathbf{A}\) are calculated as:
\[ A_{ij} = \frac{Z_{ij}}{X_j} \]
Where: - \(Z_{ij}\) is the amount of output from sector \(i\) used as input by sector \(j\). - \(X_j\) is the total output of sector \(j\).
The Leontief inverse matrix accounts for both direct and indirect effects of changes in final demand. It shows how changes in one sector affect the entire economy due to intersectoral linkages.
The output multipliers for sector \(j\) represent the total increase in output across all sectors from a unit increase in the final demand of sector \(j\). These multipliers are computed by summing the column elements of the Leontief inverse matrix:
\[ \text{Output Multiplier for Sector } j = \sum_{i} (\mathbf{I} - \mathbf{A})^{-1}_{ij} \]
This formula helps quantify how changes in demand for products or services in one sector lead to changes in the overall economic output, showing the ripple effect through the supply chain.
# Load necessary libraries
library(tidyverse) # for data wrangling
library(tmaptools) # for thematic mapping tools
library(tmap) # for creating maps
library(ggplot2) # for data visualization
library(leaflet) # for interactive maps
library(sf) # for handling spatial data
library(dplyr) # for data manipulation
library(raster) # for working with raster data
Note that the output multipliers have already been computed for all economies.
# Load the shapefile for countries
auxmap <- st_read("data/Countries_SHP1_3mil/CNTR_RG_03M_2016_3035.shp",
stringsAsFactors = FALSE)## Reading layer `CNTR_RG_03M_2016_3035' from data source
## `/Users/tomasoles/Library/CloudStorage/Dropbox/Teaching/AppliedDataAnalysis/applied_data_analysis/3_week/data/Countries_SHP1_3mil/CNTR_RG_03M_2016_3035.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 257 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: -7143416 ymin: -9160773 xmax: 16936900 ymax: 15428320
## Projected CRS: ETRS89-extended / LAEA Europe
## Classes 'sf' and 'data.frame': 257 obs. of 6 variables:
## $ CNTR_ID : chr "AD" "AE" "AF" "AG" ...
## $ CNTR_NAME: chr "Andorra" "الإمارات العربية المتحدة" "افغانستان-افغانستان" "Antigua and Barbuda" ...
## $ NAME_ENGL: chr "Andorra" "United Arab Emirates" "Afghanistan" "Antigua and Barbuda" ...
## $ ISO3_CODE: chr "AND" "ARE" "AFG" "ATG" ...
## $ FID : chr "AD" "AE" "AF" "AG" ...
## $ geometry :sfc_MULTIPOLYGON of length 257; first list element: List of 1
## ..$ :List of 1
## .. ..$ : num [1:35, 1:2] 3640254 3638125 3635523 3634181 3629280 ...
## ..- attr(*, "class")= chr [1:3] "XY" "MULTIPOLYGON" "sfg"
## - attr(*, "sf_column")= chr "geometry"
## - attr(*, "agr")= Factor w/ 3 levels "constant","aggregate",..: NA NA NA NA NA
## ..- attr(*, "names")= chr [1:5] "CNTR_ID" "CNTR_NAME" "NAME_ENGL" "ISO3_CODE" ...
# Create a choropleth map based on GDP fall using quantile style
auxQmap1 <- tm_shape(aux_map_data) +
tm_fill(col = "gGDPc",
style = "quantile",
n = 8,
legend.hist = TRUE,
palette = "YlOrRd",
title = "GDP Fall Due to Trade Sanctions on Russia",
alpha = 0.6) +
tm_layout(legend.outside = TRUE)
# Set tmap mode to view (interactive map mode)
tmap_mode("view")
# Display the map with adjusted view
auxQmap1 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)# Create a choropleth map using hierarchical clustering style for a different view
auxQmap2 <- tm_shape(aux_map_data) +
tm_fill(col = "gGDPc",
style = "hclust",
n = 8,
legend.hist = TRUE,
palette = "YlOrRd",
title = "GDP Fall Due to Trade Sanctions on Russia",
alpha = 0.6) +
tm_layout(legend.outside = TRUE)
# Set tmap mode to view (interactive map mode)
tmap_mode("view")
# Display the map with hierarchical clustering
auxQmap2 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)# Create a map for trade in intermediate products and its impact on GDP
auxQmap3 <- tm_shape(aux_map_data) +
tm_fill(col = "gGDPc_A",
style = "hclust",
n = 8,
legend.hist = TRUE,
palette = "YlOrRd",
title = "GDP Impact Due to Trade Sanctions in Intermediate Products",
alpha = 0.6) +
tm_layout(legend.outside = TRUE)
# Display the map in interactive view
# (the original rendered file defined and displayed auxQmap3 twice,
# verbatim; the duplicate definition and display are removed here)
tmap_mode("view")
auxQmap3 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)
# Create a map for trade in final products and its impact on GDP
auxQmap4 <- tm_shape(aux_map_data) +
tm_fill(col = "gGDPc_Y",
style = "hclust",
n = 8,
legend.hist = TRUE,
palette = "YlOrRd",
title = "GDP Impact Due to Trade Sanctions in Final Products",
alpha = 0.6) +
tm_layout(legend.outside = TRUE)
# Display the map in interactive view
tmap_mode("view")
auxQmap4 + tm_view(set.view = c(7, 51, 4), text.size.variable = TRUE)library(maps)
library(countrycode)
library(gapminder)
dat_map <- map_data("world")
ggplot(dat_map, aes(x = long, y = lat,
group = group)) +
geom_polygon(fill = "white", colour = "black")#Plot the first shot
gdp_map <- full_join(x=dat_map,y=gapminder, by= c("region"="country"))
ggplot(gdp_map, aes(x = long, y = lat,
group = group, fill = log10(gdpPercap))) +
geom_polygon() +
scale_fill_gradient(low = "red", high = "green")
# Too many countries are missing
# Convert country names to World Bank codes on both sides so the join
# can use a shared key (plain name matching misses many countries)
gapminder$ccode <- countrycode(gapminder$country,
origin = "country.name",
destination = "wb")
dat_map$ccode <- countrycode(dat_map$region,
origin = "country.name",
destination = "wb")
# Join map polygons to gapminder values on the shared code
gdp_map_codes <- full_join(x=dat_map,y=gapminder, by="ccode")
#Plot again after merging
ggplot(gdp_map_codes, aes(x = long, y = lat,
group = group, fill = log10(gdpPercap))) +
geom_polygon() +
scale_fill_gradient(low = "red", high = "green")library(tidyverse)
library(gapminder)
library(echarts4r)
library(gganimate)
library(ggiraph)
library(widgetframe)
library(ggthemes)
library(plotly)
library(viridis)
library(DT)
# country codes in gapminder::country_codes
gapminder_codes <- gapminder::country_codes
# countries with info in gapminder::gapminder_unfiltered
gapminder <-gapminder::gapminder_unfiltered
# We join both datasets with inner_join to get a dataset with the info by country, continent and country-code
gapminder <- gapminder %>%
inner_join(gapminder_codes, by= "country") %>%
mutate(code = iso_alpha)
# A map of the world (Antarctica removed)
world <- map_data("world") %>%
filter(region != "Antarctica")
gapminder_data <- gapminder %>%
inner_join(maps::iso3166 %>%
select(a3, mapname), by= c(code = "a3")) %>%
mutate(mapname = str_remove(mapname, "\\(.*"))
datagpmnd <- gapminder %>%
mutate(Name = recode_factor(country,
`Congo, Dem. Rep.`= "Dem. Rep. Congo",
`Congo, Rep.`= "Congo",
`Cote d'Ivoire`= "Côte d'Ivoire",
`Central African Republic`= "Central African Rep.",
`Yemen, Rep.`= "Yemen",
`Korea, Rep.`= "Korea",
`Korea, Dem. Rep.`= "Dem. Rep. Korea",
`Czech Republic`= "Czech Rep.",
`Slovak Republic`= "Slovakia",
`Dominican Republic`= "Dominican Rep.",
`Equatorial Guinea`= "Eq. Guinea"))
datagpmnd %>%
group_by(year) %>%
e_chart(Name, timeline = TRUE) %>%
e_map(lifeExp) %>%
e_visual_map(min= 30, max= 90,
type = 'piecewise') %>%
e_title("Life expectancy by country and year", left = "center") %>%
e_tooltip(
trigger = "item",
formatter = e_tooltip_choro_formatter())